install.packages("tidyverse")
install.packages("readr")
install.packages("writexl")
install.packages("Rcpp")
install.packages("ggrepel")
install.packages("margins")
install.packages("stargazer")
install.packages("plm")
install.packages("panelView")
install.packages("pcse")
install.packages("texreg")
install.packages("panelAR")


library(ggplot2)
library(readr)
library(writexl)
library(tidyverse)
library(dplyr)
library(broom)
library(ggrepel)
library(margins)
library(stargazer)
library(ggpubr)
library(plm)
library(lmtest)
library(haven)
library(panelView)
library(pcse)
library(texreg)
library(panelAR)
library(survival)
library(lubridate)
library(ggsurvfit)
library(gtsummary)
library(tidycmprsk)

#***TIME-SERIES CROSS-SECTION
ICTdiffusion <- read.csv("ICTdiffusionpanelG20Dec2023.csv", check.names = FALSE)
ICTdiffusion$percapitaGDP <- (1000000*ICTdiffusion$cgdpe)/ICTdiffusion$population

ICTpaneldatabalanced <- make.pbalanced(ICTdiffusion, balance.type = "fill")
ICTpaneldatabalanced$CSunivper100k = (100000*(ICTpaneldatabalanced$CSuniversities3yraverage))/ICTpaneldatabalanced$population

ICTpaneldatabalanced$logCSunivper100k = log((ICTpaneldatabalanced$CSunivper100k)+.001)

ICTpaneldatabalanced$logpercapitaGDP = log(ICTpaneldatabalanced$percapitaGDP)
ICTpaneldatabalanced$logpopulation = log(ICTpaneldatabalanced$population)
ICTpaneldatabalanced$logmilexpbyGDP = log(ICTpaneldatabalanced$milexpbyGDP + .1)

#visualizing distribution of independent variable SE skill infrastructure by country
subset2007 <- filter(ICTpaneldatabalanced, year == "2007")
ggplot(subset2007, aes(x=reorder(country, -CSunivper100k), y=CSunivper100k)) +
  theme(axis.title.y = element_blank())+
  ylab("Universities that meet software engineering quality baseline per 100,000 people") +
  coord_flip()+
  geom_bar(position="dodge", stat="identity")

#visualizing bivariate relationships
panelview(comphousepercent ~ CSunivper100k,
          data = ICTpaneldatabalanced, index = c("country","year"), type = "bivar", cex.main =12,
          ylab = "computerization rate", xlab = "software engineering univ. per 100,000 people",
          lwd = .8)

#panel analysis
panelARmodel <- panelAR(comphousepercent ~ logCSunivper100k + logpercapitaGDP + 
                          logpopulation, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")

summary(panelARmodel)

panelARmodel <- panelAR(comphousepercent ~ logCSunivper100k + logpercapitaGDP + 
                          logpopulation + polity, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")

summary(panelARmodel)

panelARmodel <- panelAR(comphousepercent ~ logCSunivper100k + logpercapitaGDP + 
                          logpopulation +  polity + logmilexpbyGDP + LMEdummy, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")

summary(panelARmodel)

#additional controls: urbanization and trade openness and regional dummies
panelARmodel <- panelAR(comphousepercent ~ logCSunivper100k + logpercapitaGDP + 
                          logpopulation +  polity + logmilexpbyGDP + LMEdummy + tradeopenness + urbanpercent, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")
summary(panelARmodel)

panelARmodel <- panelAR(comphousepercent ~ logCSunivper100k + logpercapitaGDP + 
                          logpopulation +  polity + logmilexpbyGDP + LMEdummy + 
                          EastAsiaPacific + EuropeCentralAsia + LatinAmericaCaribbean + 
                          MiddleEastNorthAfrica + SouthAsia + SubsaharanAfrica, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")
summary(panelARmodel)

#alternative IV specification - researchers

ICTpaneldatabalanced$CSresearcherper100k = (100000*(ICTpaneldatabalanced$SEresearcheraverage))/ICTpaneldatabalanced$population

ICTpaneldatabalanced$logCSresearcherper100k = log((ICTpaneldatabalanced$CSresearcherper100k)+.01)


altIVmodel1 <- panelAR(comphousepercent ~ logCSresearcherper100k + logpercapitaGDP + 
                          logpopulation +  polity + logmilexpbyGDP + LMEdummy, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")
summary(altIVmodel1)

altIVmodel2 <- panelAR(comphousepercent ~ logCSresearcherper100k + logpercapitaGDP + 
                          logpopulation +  polity + logmilexpbyGDP + LMEdummy + tradeopenness + urbanpercent, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")
summary(altIVmodel2)

altIVmodel3 <- panelAR(comphousepercent ~ logCSresearcherper100k + logpercapitaGDP + 
                          logpopulation +  polity + logmilexpbyGDP + LMEdummy + 
                          EastAsiaPacific + EuropeCentralAsia + LatinAmericaCaribbean + 
                          MiddleEastNorthAfrica + SouthAsia + SubsaharanAfrica, 
                        data = ICTpaneldatabalanced, 
                        panelVar = "country", 
                        timeVar = "year",
                        autoCorr = "ar1",
                        panelCorrMethod = "pcse")
summary(altIVmodel3)

#***DURATION ANALYSIS

durationdata <- read.csv("ICTdiffusiondurationDec2023.csv", check.names = FALSE)

durationdata$CSunivper100k = (100000*(durationdata$CSuniversities1995threeyear))/durationdata$pop1995
durationdata$logCSunivper100k = log((durationdata$CSunivper100k)+.001)
durationdata$logGDPpc1995 = log(durationdata$percapitaGDP1995)
durationdata$logpop1995 = log(durationdata$pop1995)
durationdata$logmilexpbyGDP1995 = log(durationdata$milexpbyGDP1995 + .1)
durationdata$logtradeopenness1995 = log(durationdata$tradeopenness1995)
durationdata$urbanization1995 <- as.numeric(durationdata$urbanization1995)

durationdata$logurbanization1995 = log(durationdata$urbanization1995)

subsetdurationdata <- filter(durationdata, avgpercapitaGDP>3596 & Unmembership == 1)

#Cox regression model

coxregression25 <- coxph(Surv(Time_to_computerization_25percent, Status_25percent) ~ 
                           logCSunivper100k + logGDPpc1995 + logpop1995 + polity1995 + logmilexpbyGDP1995 + LMEdummy, data = subsetdurationdata)

summary(coxregression25)
coxregression20 <- coxph(Surv(Time_to_computerization_20percent, Status_20percent) ~ 
                           logCSunivper100k + logGDPpc1995 + logpop1995 + polity1995 + logmilexpbyGDP1995 + LMEdummy, data = subsetdurationdata)

summary(coxregression20)

stargazer(coxregression25, coxregression20, type = "text", 
          title = "Table X: Cox Proportional Hazards Model", out = "tab-initialregression-quantextensionchapter.htm")

#additional controls for duration analysis
coxregressioncontrols <- coxph(Surv(Time_to_computerization_25percent, Status_25percent) ~ 
                           logCSunivper100k + logGDPpc1995 + logtradeopenness1995 + logurbanization1995 +
                             EastAsiaPacific + EuropeCentralAsia + LatinAmericaCaribbean + MiddleEastNorthAfrica + SouthAsia + SubsaharanAfrica
                           , data = subsetdurationdata)

summary(coxregressioncontrols)
stargazer(coxregressioncontrols,  type = "text", 
          title = "Table X: Cox Proportional Hazards Model", out = "tab-initialregression-quantextensionchapter.htm")

test.ph = cox.zph(coxregression20)
test.ph

test.ph = cox.zph(coxregression25)
test.ph

test.ph = cox.zph(coxregressioncontrols)
test.ph


#***CROSS SECTION

ICTdiffusionprereg <- read.csv("ICTdiffusioncrosssectionDec2023.csv", check.names = FALSE)

ICTdiffusionprereg$logavgGDPpc = log(ICTdiffusionprereg$avgpercapitaGDP)
ICTdiffusionprereg$logavgpop = log(ICTdiffusionprereg$avgpop)
ICTdiffusionprereg$logavgmilexpbyGDP = log(ICTdiffusionprereg$avgmilexpbyGDP + .1)

ICTdiffusionprereg$CSunivper100k = (100000*(ICTdiffusionprereg$CSuniversities20072008))/ICTdiffusionprereg$avgpop
ICTdiffusionprereg$logCSunivper100k = log((ICTdiffusionprereg$CSunivper100k)+.001)

#Operationalizing scope conditions

subsetICTdiffusionprereg <- filter(ICTdiffusionprereg, avgpercapitaGDP>3596 & UNmembership == 1)

#Baseline regression model - incremental inclusion of controls across three models

reg_baseline1 <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop, data = subsetICTdiffusionprereg)
summary(reg_baseline1)
reg_baseline2 <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity, data = subsetICTdiffusionprereg)
summary(reg_baseline2)
reg_baseline3 <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity + logavgmilexpbyGDP + LMEdummy, data = subsetICTdiffusionprereg)
summary(reg_baseline3)

#additional controls -- trade openness and regional dummies
ICTdiffusionprereg$CSunivper100k = (100000*(ICTdiffusionprereg$CSuniversities20072008))/ICTdiffusionprereg$avgpop
ICTdiffusionprereg$logCSunivper100k = log((ICTdiffusionprereg$CSunivper100k)+.001)
ICTdiffusionprereg$logtrade_openness = log(ICTdiffusionprereg$avgtradeopenness)
ICTdiffusionprereg$logavgurbanization = log(ICTdiffusionprereg$avgurbanization)

d <- density(ICTdiffusionprereg$avgurbanization, na.rm = TRUE)
plot(d)

subsetICTdiffusionprereg <- filter(ICTdiffusionprereg, avgpercapitaGDP>3596 & UNmembership == 1)

reg_ICTdiffusion_subset <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity + logavgmilexpbyGDP + LMEdummy + logtrade_openness + logavgurbanization, data = subsetICTdiffusionprereg)
summary(reg_ICTdiffusion_subset)

reg_ICTdiffusion_regional <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity + logavgmilexpbyGDP + LMEdummy 
                                + EastAsiaPacific + EuropeCentralAsia + LatinAmericaCaribbean + MiddleEastNorthAfrica + SouthAsia + SubsaharanAfrica, data = subsetICTdiffusionprereg)
summary(reg_ICTdiffusion_regional)

#reverse causality and endogeneity
ICTdiffusionprereg$CSunivper100k = (100000*(ICTdiffusionprereg$CSuniversities1995))/ICTdiffusionprereg$avgpop

ICTdiffusionprereg$logCSunivper100k = log((ICTdiffusionprereg$CSunivper100k)+.001)
subsetICTdiffusionprereg <- filter(ICTdiffusionprereg, avgpercapitaGDP>3596 & UNmembership == 1)

reg_ICTdiffusion_subset <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity + logavgmilexpbyGDP + LMEdummy, data = subsetICTdiffusionprereg)

#LS model - computer exports and ICT patents

subsetICTdiffusionprereg$logcompexportstotal = log(subsetICTdiffusionprereg$compexports1996)

reg_compexportstotal <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity + logavgmilexpbyGDP + LMEdummy + logcompexportstotal, data = subsetICTdiffusionprereg)
summary(reg_compexportstotal)

subsetICTdiffusionprereg$logICTpatents = log((subsetICTdiffusionprereg$ICTpatents1995) + .001)
reg_patentstotal <- lm(avgcomphousepercent ~ logCSunivper100k + logavgGDPpc + logavgpop + avgpolity + logavgmilexpbyGDP + LMEdummy + logICTpatents, data = subsetICTdiffusionprereg)
summary(reg_patentstotal)

#***APPENDIX

corranalysis <- read.csv("OECD Survey on ICT Usage by Businesses Persons Employed Using Computer in Work.csv", check.names = FALSE)

exceltransfer <- corranalysis %>% group_by(Country) %>% 
  summarise(avgcomphousepercent = mean(comphousepercent, na.rm = TRUE),
            avgcompuse_personsemployed = mean(compuse_personsemployed, na.rm = TRUE),
  )

cor(exceltransfer$avgcomphousepercent,exceltransfer$avgcompuse_personsemployed, method = c("pearson"))
cor.test(exceltransfer$avgcomphousepercent,exceltransfer$avgcompuse_personsemployed, method = c("pearson"))

ggplot(data = exceltransfer, aes (x = avgcomphousepercent, y = avgcompuse_personsemployed))+
  geom_point() +
  geom_smooth(method = 'lm') +
  ylab("Businesses with employees that use computers (percent)") + xlab("Households with computer (percent)") +
  stat_cor(method = "pearson")
